import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import altair as alt
# set the default renderer to vega
alt.data_transformers.enable("vegafusion")
# Set display format for floating-point numbers
pd.options.display.float_format = '{:,.2f}'.format
# read from a csv file into a pd dataframe
df_ranking = pd.read_csv('~/Downloads/bi_ranking.csv', low_memory=False)
# the companias string
df_ids = pd.read_csv('~/Downloads/bi_compania.csv', low_memory=False)
# segementos
df_segmentos = pd.read_csv('~/Downloads/bi_segmento.csv', low_memory=False)
# Código de Clasificacón Industrial Internacional Unifrome
df_ciiu = pd.read_csv('~/Downloads/bi_ciiu.csv', low_memory=False)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[1], line 13
10 pd.options.display.float_format = '{:,.2f}'.format
12 # read from a csv file into a pd dataframe
---> 13 df_ranking = pd.read_csv('~/Downloads/bi_ranking.csv', low_memory=False)
14 # the companias string
15 df_ids = pd.read_csv('~/Downloads/bi_compania.csv', low_memory=False)
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File ~/.local/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: '/home/terac/Downloads/bi_ranking.csv'
Let’s only get the year 2023 and make the passivos by subtracting the patrimonio from the activos#
let’s merge all of the df into one#
# get the anio with 2023
df_ranking = df_ranking[df_ranking['anio'] == 2023]
# rename the cuii column
df_ciiu = df_ciiu.rename(columns={'descripcion': 'ciiu_desc', 'ciiu': 'ciiu_code'})
df_ciiu['ciiu_code'] = df_ciiu['ciiu_code'].str.strip()
# match all of the expedientes in the df_ids with the expedientes in the df_ranking
df = pd.merge(df_ranking, df_ids, on='expediente', how='left')
# let's merge the ciiu and the segments with the
df = pd.merge(df, df_ciiu, left_on='ciiu_n1', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n1_code', 'ciiu_desc': 'ciiu_n1_desc'}, inplace=True)
df = pd.merge(df, df_ciiu, left_on='ciiu_n6', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n6_code', 'ciiu_desc': 'ciiu_n6_desc'}, inplace=True)
# get the passivos by subtracting patrimonio from activos
df['pasivos'] = df['activos'] - df['patrimonio']
# check that there are error where activos is not equal to passivos + patrimonio
#df_error = df[df['activos'] != df['pasivos'] + df['patrimonio']]
# get the name and the passivos columns only
#df_error = df_error[['nombre', 'activos', 'pasivos', 'patrimonio']]
#df_error['margin'] = df_error['activos'] - df_error['pasivos'] - df_error['patrimonio']
# df print nu
#df_error.head()
print(df.columns)
Index(['anio', 'expediente', 'posicion_general', 'cia_imvalores',
'id_estado_financiero', 'ingresos_ventas', 'activos', 'patrimonio',
'utilidad_an_imp', 'impuesto_renta', 'n_empleados', 'ingresos_totales',
'utilidad_ejercicio', 'utilidad_neta', 'cod_segmento', 'ciiu_n1',
'ciiu_n6', 'liquidez_corriente', 'prueba_acida', 'end_activo',
'end_patrimonial', 'end_activo_fijo', 'end_corto_plazo',
'end_largo_plazo', 'cobertura_interes', 'apalancamiento',
'apalancamiento_financiero', 'end_patrimonial_ct',
'end_patrimonial_nct', 'apalancamiento_c_l_plazo', 'rot_cartera',
'rot_activo_fijo', 'rot_ventas', 'per_med_cobranza', 'per_med_pago',
'impac_gasto_a_v', 'impac_carga_finan', 'rent_neta_activo',
'margen_bruto', 'margen_operacional', 'rent_neta_ventas',
'rent_ope_patrimonio', 'rent_ope_activo', 'roe', 'roa',
'fortaleza_patrimonial', 'gastos_financieros', 'gastos_admin_ventas',
'depreciaciones', 'amortizaciones', 'costos_ventas_prod', 'deuda_total',
'deuda_total_c_plazo', 'total_gastos', 'ruc', 'nombre', 'tipo',
'pro_codigo', 'provincia', 'ciiu_n1_code', 'ciiu_n1_desc',
'ciiu_n6_code', 'ciiu_n6_desc', 'pasivos'],
dtype='object')
let seperate the companies into categoris by size#
# make companies categories between 0 and 2.5 billion in assets
# small companies: 0 - 500 thousand
# medium companies: 500 thousand - 50 million
# large companies: 50 million - up
df_small = df[df['activos'] < 500000]
df_medium = df[(df['activos'] >= 500000) & (df['activos'] < 50000000)]
df_large = df[(df['activos'] >= 50000000)]
# get the number of companies in each category and make a pie chart
pie_sizes_df = pd.DataFrame({
'size': ['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50 - up'],
'count': [df_small.shape[0], df_medium.shape[0], df_large.shape[0]]
})
pie_df['percentage'] = (pie_df['count'] / pie_df['count'].sum()) * 100
# Define custom colors for each category using named colors
color_scale = alt.Scale(domain=['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50 - up'],
range=['steelblue', 'orange', 'mediumseagreen' ])
pie_df.head()
pie_chart = alt.Chart(pie_df).mark_arc().encode(
theta=alt.Theta(field="count", type="quantitative"),
color=alt.Color(field="size", type="nominal", scale=color_scale),
tooltip=['size', 'count', alt.Tooltip('percentage:Q', format='.2f')]
).interactive()
pie_chart.display()
type_counts = df['tipo'].value_counts().reset_index()
type_counts.columns = ['tipo', 'count']
type_counts['percentage'] = (type_counts['count'] / type_counts['count'].sum()) * 100
alt.Chart(type_counts).mark_arc().encode(
theta=alt.Theta(field="count", type="quantitative"),
color=alt.Color(field="tipo", type="nominal"),
tooltip=['tipo', 'count', alt.Tooltip('percentage:Q', format='.2f')]
# show percentge in legentd
).interactive().display()
def make_chart(df, title='Activos de Companias', max_bins=100, color='steelblue', width=450, height=300):
# Create a selection
click = alt.selection_point(encodings=['x'])
# Create the right bar chart
bars = alt.Chart(df).mark_bar().encode(
x=alt.X('activos:Q', bin=alt.Bin(maxbins=max_bins), title='Activos'),
y=alt.Y( 'count()', title='Numero de companias'),
color=alt.condition(click, alt.ColorValue(color), alt.ColorValue(color)),
tooltip=['count():Q']
).add_params(
click
).properties(
width=width,
height=height,
title=title,
).interactive()
# Create the left chart (scatter plot)
scatter = alt.Chart(df).mark_circle().encode(
x='pasivos:Q',
y='activos:Q',
color='ciiu_n1_desc:N',
tooltip=['nombre:N', 'activos:Q', 'pasivos:Q', 'patrimonio:Q', 'ciiu_n1_desc:N']
).transform_filter(
click
).properties(
width=width,
height=height,
title=''
).interactive()
# Filter the left chart based on selection
filtered_scatter = scatter.transform_filter(
click
)
# Combine the charts
chart = alt.hconcat(bars, filtered_scatter).resolve_legend(
color="independent",
)
chart.show()
make_chart(df_small, title='Activos de Companias pequenas', color='steelblue')